Created
August 24, 2011 15:38
-
-
Save drio/1168330 to your computer and use it in GitHub Desktop.
finding the fastest way to extract from a fastq
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include "khash.h" // khash.h from samtools/bwa/seqtk/klib | |
KHASH_SET_INIT_STR(s) | |
#define BUF_SIZE 4096 | |
int main(int argc, char *argv[]) | |
{ | |
char buf[BUF_SIZE]; | |
FILE *fp; | |
int lineno = 0, flag = 0, ret; | |
khash_t(s) *h; | |
if (argc == 1) { | |
fprintf(stderr, "Usage: cat in.fq | fqextract <name.lst>\n"); | |
return 1; | |
} | |
h = kh_init(s); | |
fp = fopen(argv[1], "rb"); // FIXME: check fp | |
while (fgets(buf, BUF_SIZE, fp)) | |
kh_put(s, h, strdup(buf), &ret); // FIXME: check ret | |
fclose(fp); | |
while (fgets(buf, BUF_SIZE, stdin)) { | |
if (++lineno%4 == 1) | |
flag = (kh_get(s, h, buf + 1) != kh_end(h)); | |
if (flag) fputs(buf, stdout); | |
} | |
kh_destroy(s, h); // FIXME: free keys before destroy | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CC=gcc | |
FLAGS=-O3 -Wall | |
FQ=input.fastq | |
LIST=list.txt | |
TMP=/stornext/snfs0/next-gen/Illumina/Instruments/700733/110803_SN733_0151_BD07UMACXX/Results/Project_110803_SN733_0151_BD07UMACXX/Sample_D07UMACXX-8-ID12/D07UMACXX-8-ID12_1_sequence.txt | |
uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') | |
ifeq ($(uname_S),Linux) | |
O_FQ=$(TMP) | |
SED=sed | |
else | |
O_FQ=/Users/drio/sshfs/ardmore$(TMP) | |
SED=gsed | |
endif | |
ifndef N_READS | |
N_READS=1000000 | |
endif | |
BIN=fqextract | |
BIN_MM=mmap_fqextract | |
LINE=@printf '_________________________________________________ '; | |
all: clean run | |
run: $(FQ) $(LIST) $(BIN) $(BIN_MM) out.txt out_mmap.txt diff | |
diff: out.txt out_mmap.txt | |
$(LINE) echo "DIFF" | |
diff $+ | wc -l | |
out.txt: $(FQ) $(BIN) $(LIST) | |
$(LINE) echo "BM: $(BIN)" | |
time cat $(FQ) | ./$(BIN) $(LIST) > out.txt | |
out_mmap.txt: $(FQ) $(BIN_MM) $(LIST) | |
$(LINE) echo "BM: $(BIN_MM)" | |
time ./$(BIN_MM) $(LIST) $(FQ) > out_mmap.txt | |
khash.h: | |
curl https://raw.github.com/attractivechaos/klib/master/khash.h > $@ | |
$(FQ): $(O_FQ) | |
time head -$(N_READS) $< | awk '{print $$1}' > $@ | |
@echo "Number of reads: `cat $@ | egrep -e "^@HWI" | wc -l`" | |
$(LIST): $(FQ) | |
egrep -e "^@HWI" $< | awk '{print $$1}' | $(SED) -s 's/@//' |\ | |
ruby -ne 'puts $$_ if rand(1000) % 5 == 0' > $@ | |
wc -l $@ | |
$(BIN): $(BIN).c khash.h | |
$(CC) $(FLAGS) $< -o $@ | |
$(BIN_MM): $(BIN_MM).c khash.h | |
$(CC) $(FLAGS) $< -o $@ | |
clean: | |
rm -f out* $(LIST) $(FQ) $(BIN) $(BIN_MM) | |
.PHONY: clean |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <sys/types.h> | |
#include <sys/stat.h> | |
#include <fcntl.h> | |
#include <unistd.h> | |
#include <sys/mman.h> | |
#include "khash.h" // khash.h from samtools/bwa/seqtk/klib | |
KHASH_SET_INIT_STR(s) | |
#define BUF_SIZE 4096 | |
int main(int argc, char *argv[]) | |
{ | |
char buf[BUF_SIZE]; | |
int fd; | |
int lineno = 0, flag = 0, ret, bp = 0; | |
struct stat sb; | |
char *p, *mp; // start of mmap data, pointer within the mmap data | |
off_t len; | |
khash_t(s) *h; | |
if (argc == 2) { | |
fprintf(stderr, "Usage: fqextract <name.lst> <in.fq>\n"); | |
return 1; | |
} | |
h = kh_init(s); | |
fd = open(argv[1], O_RDONLY); // FIXME: check fp | |
fstat(fd, &sb); | |
p = mmap(0, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); | |
close(fd); | |
for (len=0; len < sb.st_size; len++) { | |
if (p[len] == '\n') { | |
buf[bp] = '\0'; | |
bp = 0; | |
kh_put(s, h, strdup(buf), &ret); // FIXME: check ret | |
} | |
else | |
buf[bp++] = p[len]; | |
} | |
munmap(p, sb.st_size); | |
fd = open(argv[2], O_RDWR); // FIXME: check fp | |
fstat(fd, &sb); | |
p = mmap(0, sb.st_size, PROT_WRITE, MAP_SHARED, fd, 0); | |
close(fd); | |
for (len=0, mp=p, lineno=0; len < sb.st_size; len++) { | |
if (p[len] == '\n') { | |
if (lineno == 0) { | |
p[len] = '\0'; | |
flag = (kh_get(s, h, mp+1) != kh_end(h)); | |
p[len] = '\n'; | |
lineno = 1; | |
} | |
else if (lineno == 3) { | |
p[len+1] = '\0'; | |
if (flag) fputs(mp, stdout); | |
p[len+1] = '@'; | |
mp = p + len + 1; | |
lineno = 0; | |
} | |
else { | |
lineno++; | |
} | |
} | |
} | |
munmap(p, sb.st_size); | |
kh_destroy(s, h); // FIXME: free keys before destroy | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I do not understand how to use this program...
I downloaded it and tried 'make all' but it failed with this error :
make: *** No rule to make target
/stornext/snfs0/next-gen/Illumina/Instruments/700733/110803_SN733_0151_BD07UMACXX/Results/Project_110803_SN733_0151_BD07UMACXX/Sample_D07UMACXX-8-ID12/D07UMACXX-8-ID12_1_sequence.txt', needed by
input.fastq'. Stop.Am I not supposed to build it ?
Any help would be very much appreciated.
Thanks !